This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. Html_fragmented is quicker to load than html, hence for this pipeline which genenerated a large mount of data, we have used html_fragment to report Rmarkdown.
#1) The input data for this code was obtained from htseq-count data obtained from processing raw fastq files #2) Input expression files are SRR(number).txt htseq-count from RNA-seq analysis folder for GSE61915, GSE73503 and GSE83931 #3) metadata file made by manually combining individual metadata files form GEO to give ‘GSE61915_GSE73503_GSE83931_metadata_edit.csv’ #4) metadata coded file made manually from ‘merge_GSE61915_GSE73503_GSE83931_metadata_SampleID.csv’ outputed at end of merge code Step5 to give ‘merge_GSE61915_GSE73503_GSE83931_metadata_SampleID_coded’ code used is Study GSE61915=1, GSE73503=2, GSE83931=3; Gender Female=1, Male=2, MaleFemale=3 #5) run code till step 22 and then based on cell type and trait associated module replace in code “yellow” or “MsAge7K_04” with module name that is associated with trait or disease of interest. And replace hub gene names. #6) WGCNA assigns colors randomly (except few reserved colors like “grey” for unassigned genes). Therefore, upon re-run of this analysis it may call the age associated microglia enriched “yellow” module by some other color such as “brown”, in which case this color is specified step 23 onwards as described in point 5) above. #7) The present analysis was done on MacOS, using knitToHtmlfragment.
#8) To reuse this code for other datasets a) replace 1) and 2) input files above with equivalent files for the dataset b) replace “MsAge7K_” for module naming to pre-fix of users choice that is meaningful for the dataset. Here Ms=Mouse, Age=Age c) this pipeline uses Human gene symbols
merging htseq-count data for RNA-seq mouse GSE61915, GSE73503 and GSE83931
#save working directory location
wd<-getwd()
wd
## [1] "/Users/powermac/Desktop/MsAge11-30-18"
#Install packages by uncommenting two lines below if packages not already installed before
#source("https://bioconductor.org/biocLite.R")
##biocLite(c("stringr", "reshape2", "dplyr", "ggplot2", "magrittr", "edgeR", "DT))
library(stringr)
library(reshape2)
library(dplyr)
library(ggplot2)
library(magrittr)
library(edgeR)
library(DT)
setwd("./InputHtseqCount")
#Be careful to check that the input pattern here is such that only the count data and no other files are selected
htseqcountfiles <- list.files(pattern = "*.txt")
htseqcountfiles
## [1] "SRR1593496.txt" "SRR1593497.txt" "SRR1593498.txt" "SRR1593499.txt"
## [5] "SRR1593500.txt" "SRR1593501.txt" "SRR1593502.txt" "SRR1593503.txt"
## [9] "SRR1593504.txt" "SRR1593505.txt" "SRR1593506.txt" "SRR1593507.txt"
## [13] "SRR1593508.txt" "SRR1593509.txt" "SRR1593510.txt" "SRR1593511.txt"
## [17] "SRR1593512.txt" "SRR2531532.txt" "SRR2531533.txt" "SRR2531534.txt"
## [21] "SRR2531535.txt" "SRR2531536.txt" "SRR2531537.txt" "SRR2531538.txt"
## [25] "SRR2531539.txt" "SRR2531540.txt" "SRR2531541.txt" "SRR2531542.txt"
## [29] "SRR2531543.txt" "SRR2531544.txt" "SRR2531545.txt" "SRR2531546.txt"
## [33] "SRR2531547.txt" "SRR2531548.txt" "SRR2531549.txt" "SRR2531550.txt"
## [37] "SRR2531551.txt" "SRR2531552.txt" "SRR2531553.txt" "SRR2531554.txt"
## [41] "SRR2531555.txt" "SRR3734796.txt" "SRR3734797.txt" "SRR3734798.txt"
## [45] "SRR3734799.txt" "SRR3734800.txt" "SRR3734801.txt" "SRR3734802.txt"
## [49] "SRR3734803.txt" "SRR3734804.txt" "SRR3734805.txt" "SRR3734806.txt"
## [53] "SRR3734807.txt" "SRR3734808.txt" "SRR3734809.txt" "SRR3734810.txt"
## [57] "SRR3734811.txt" "SRR3734812.txt" "SRR3734813.txt" "SRR3734814.txt"
## [61] "SRR3734815.txt" "SRR3734816.txt" "SRR3734817.txt" "SRR3734818.txt"
## [65] "SRR3734819.txt" "SRR3734820.txt" "SRR3734821.txt" "SRR3734822.txt"
## [69] "SRR3734823.txt" "SRR3734824.txt" "SRR3734825.txt"
setwd(wd)
setwd("./InputHtseqCount")
#Given that the above files were imported properly as shown above, lets read the entire list of files into a single object
readhtseqcountfiles = lapply(htseqcountfiles, read.table, sep="\t",fill=TRUE)
setwd(wd)
#Now that all the files in the list have been 'read' we can merge them by 'V1' column which has gene symbols
htseqCount <- Reduce(function(x, y) {
merge(x, y, all=TRUE, by="V1")
}, readhtseqcountfiles)
#Removal of extra htseqcount information
htseqCount1=htseqCount[6:53677,]
#Gene names to row names
rownames(htseqCount1)=htseqCount1$V1
htseqCount1$V1=NULL
#Sample names as colnames
htseqCount2 <- htseqCount1
colnames(htseqCount2)<-htseqcountfiles
colnames(htseqCount2) <- gsub(".txt","",colnames(htseqCount2))
#Here we just eyeball and make sure that the order of the samples here is same as the order of the input sample htseqcount files in the first code chunk i.e. 'htseqcountfiles <- list.files(pattern = "*.txt")'
htseqcountfiles
## [1] "SRR1593496.txt" "SRR1593497.txt" "SRR1593498.txt" "SRR1593499.txt"
## [5] "SRR1593500.txt" "SRR1593501.txt" "SRR1593502.txt" "SRR1593503.txt"
## [9] "SRR1593504.txt" "SRR1593505.txt" "SRR1593506.txt" "SRR1593507.txt"
## [13] "SRR1593508.txt" "SRR1593509.txt" "SRR1593510.txt" "SRR1593511.txt"
## [17] "SRR1593512.txt" "SRR2531532.txt" "SRR2531533.txt" "SRR2531534.txt"
## [21] "SRR2531535.txt" "SRR2531536.txt" "SRR2531537.txt" "SRR2531538.txt"
## [25] "SRR2531539.txt" "SRR2531540.txt" "SRR2531541.txt" "SRR2531542.txt"
## [29] "SRR2531543.txt" "SRR2531544.txt" "SRR2531545.txt" "SRR2531546.txt"
## [33] "SRR2531547.txt" "SRR2531548.txt" "SRR2531549.txt" "SRR2531550.txt"
## [37] "SRR2531551.txt" "SRR2531552.txt" "SRR2531553.txt" "SRR2531554.txt"
## [41] "SRR2531555.txt" "SRR3734796.txt" "SRR3734797.txt" "SRR3734798.txt"
## [45] "SRR3734799.txt" "SRR3734800.txt" "SRR3734801.txt" "SRR3734802.txt"
## [49] "SRR3734803.txt" "SRR3734804.txt" "SRR3734805.txt" "SRR3734806.txt"
## [53] "SRR3734807.txt" "SRR3734808.txt" "SRR3734809.txt" "SRR3734810.txt"
## [57] "SRR3734811.txt" "SRR3734812.txt" "SRR3734813.txt" "SRR3734814.txt"
## [61] "SRR3734815.txt" "SRR3734816.txt" "SRR3734817.txt" "SRR3734818.txt"
## [65] "SRR3734819.txt" "SRR3734820.txt" "SRR3734821.txt" "SRR3734822.txt"
## [69] "SRR3734823.txt" "SRR3734824.txt" "SRR3734825.txt"
colnames(htseqCount2)
## [1] "SRR1593496" "SRR1593497" "SRR1593498" "SRR1593499" "SRR1593500"
## [6] "SRR1593501" "SRR1593502" "SRR1593503" "SRR1593504" "SRR1593505"
## [11] "SRR1593506" "SRR1593507" "SRR1593508" "SRR1593509" "SRR1593510"
## [16] "SRR1593511" "SRR1593512" "SRR2531532" "SRR2531533" "SRR2531534"
## [21] "SRR2531535" "SRR2531536" "SRR2531537" "SRR2531538" "SRR2531539"
## [26] "SRR2531540" "SRR2531541" "SRR2531542" "SRR2531543" "SRR2531544"
## [31] "SRR2531545" "SRR2531546" "SRR2531547" "SRR2531548" "SRR2531549"
## [36] "SRR2531550" "SRR2531551" "SRR2531552" "SRR2531553" "SRR2531554"
## [41] "SRR2531555" "SRR3734796" "SRR3734797" "SRR3734798" "SRR3734799"
## [46] "SRR3734800" "SRR3734801" "SRR3734802" "SRR3734803" "SRR3734804"
## [51] "SRR3734805" "SRR3734806" "SRR3734807" "SRR3734808" "SRR3734809"
## [56] "SRR3734810" "SRR3734811" "SRR3734812" "SRR3734813" "SRR3734814"
## [61] "SRR3734815" "SRR3734816" "SRR3734817" "SRR3734818" "SRR3734819"
## [66] "SRR3734820" "SRR3734821" "SRR3734822" "SRR3734823" "SRR3734824"
## [71] "SRR3734825"
#72 columns i.e. 71 samples + 1 column of genes, number of rows is the total number of gene annotations
dim(htseqCount2)
## [1] 53672 71
#Also can check that this is correct by looking at the first gene expression here '0610005C13Rik' in the individual htseqcount files SRR1593496.txt and SRR1593497.txt. I checked and it looks ok.
DT::datatable(head(htseqCount2))
merge_metadata=read.csv('./InputMsAge/GSE61915_GSE73503_GSE83931_metadata_edit.csv', header =T, sep=',')
DT::datatable(head(merge_metadata))
#Number of rows of above metadata is same as number of columns of Expr
dim(htseqCount2)
## [1] 53672 71
dim(merge_metadata)
## [1] 71 6
#replace the column names in expression with sample names
colnames(htseqCount2) <- merge_metadata$Sample_name[match(colnames(htseqCount2), merge_metadata$Run)]
DT::datatable(head(htseqCount2))
#Also can check that this is correct by looking at the first gene expression here '0610005C13Rik' in the individual htseqcount files SRR1593496.txt and SRR1593497.txt. I checked and it looks ok.
#Lets drop the Run number from the metadata now
merge_metadata=merge_metadata[,-c(1)]
DT::datatable(head(merge_metadata))
#edgeR recommends to remove genes without atleast 1 cpm in >= no. of samples in minimum sample group. For us it is 3 for 29 months old mouse sample
Keep<-rowSums(cpm(htseqCount2) > 1) >= 3
htseqCount2_keep<-htseqCount2[Keep, ]
dim(htseqCount2)
## [1] 53672 71
dim(htseqCount2_keep)
## [1] 16769 71
#We remove 53K-16K=37K low genes
#cpm normalize for library size differences. Alternatives are RPKM or TPM normalization.
htseqCount2_keepcpm=cpm(htseqCount2_keep, normalized.lib.sizes=TRUE)
DT::datatable(htseqCount2_keepcpm[1:7,1:7])
#log2+1 transform. Our Human array data is also in log2+1 space so now RNA-seq mouse will be consistent with that
htseqCount2_cpmlog=log2(htseqCount2_keepcpm+1)
DT::datatable(htseqCount2_cpmlog[1:7,1:7])
#4.1 Check for negative values. Negative values will not work in WGCNA
exprRaw_ifneg<-apply(htseqCount2, 1, function(row) any(row <0))
length(which(exprRaw_ifneg)) #what is the length of negative numbers
## [1] 0
exprRaw1_ifneg<-apply(htseqCount2_keep, 1, function(row) any(row <0))
length(which(exprRaw1_ifneg)) #what is the length of negative numbers
## [1] 0
exprCpm_ifneg<-apply(htseqCount2_keepcpm, 1, function(row) any(row <0))
length(which(exprCpm_ifneg)) #what is the length of negative numbers
## [1] 0
exprCpmLog_ifneg<-apply(htseqCount2_cpmlog, 1, function(row) any(row <0))
length(which(exprCpmLog_ifneg)) #what is the length of negative numbers
## [1] 0
#0 means no negative values
#4.2 Check for max and min values. Ususally don’t want max to be <100
exprRaw_max<-which.max(as.matrix(htseqCount2))
exprRaw_max
## [1] 1705292
exprRaw_min<-which.min(as.matrix(htseqCount2))
exprRaw_min
## [1] 2
exprRaw1_max<-which.max(as.matrix(htseqCount2_keep))
exprRaw1_max
## [1] 529693
exprRaw1_min<-which.min(as.matrix(htseqCount2_keep))
exprRaw1_min
## [1] 183
exprCpm_max<-which.max(as.matrix(htseqCount2_keepcpm))
exprCpm_max
## [1] 378772
exprCpm_min<-which.min(as.matrix(htseqCount2_keepcpm))
exprCpm_min
## [1] 183
exprCpmLog_max<-which.max(as.matrix(htseqCount2_cpmlog))
exprCpmLog_max
## [1] 378772
exprCpmLog_min<-which.min(as.matrix(htseqCount2_cpmlog))
exprCpmLog_min
## [1] 183
#4.3 Descriptive or summary statistics of the data
DT::datatable(summary(htseqCount2))
DT::datatable(summary(htseqCount2_keep))
DT::datatable(summary(htseqCount2_keepcpm))
DT::datatable(summary(htseqCount2_cpmlog))
#4.4 Visualization of merged GSE61915 GSE73503 and GSE83931
pdf(file="Visualization_GSE61915_GSE73503_GSE83931_rawdata_transformation.pdf",height=5,width=5)
par(mfrow=c(2,2)) #plots are big, so put one per page
#exprRaw boxplot
par(mai=c(1,0.8,1,0.8))
boxplot(htseqCount2, outline=FALSE, las=2, cex=0.25, main="exprRaw", col="yellow")
#exprRaw1 boxplot
par(mai=c(1,0.8,1,0.8))
boxplot(htseqCount2_keep, outline=FALSE, las=2, cex=0.25, main="exprRaw", col="yellow")
#exprCpm boxplot
par(mai=c(1,0.8,1,0.8))
boxplot(htseqCount2_keepcpm, outline=FALSE, las=2, cex=0.25, main="exprCpm", col="yellow")
#exprCpmLog boxplot
par(mai=c(1,0.8,1,0.8))
boxplot(htseqCount2_cpmlog, outline=FALSE, las=2, cex=0.25, main="exprCpmLog", col="yellow")
dev.off()
## quartz_off_screen
## 2
write.csv(htseqCount2_cpmlog,"merge_GSE61915_GSE73503_GSE83931_Expr_GeneMs_SampleID.csv")
write.csv(merge_metadata,"merge_GSE61915_GSE73503_GSE83931_metadata_SampleID.csv")
#The files we needed are saved so we now clear workspace and delete files and folders that are not needed
save.image(file="preSVALM_temp.RData")
rm(list=ls())
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 827533 44.2 1467998 78.4 NA 1467998 78.4
## Vcells 1532134 11.7 27065806 206.5 102400 33830409 258.2
SVA+LM normalization
#save working directory location
wd<-getwd()
wd
## [1] "/Users/powermac/Desktop/MsAge11-30-18"
#source("https://bioconductor.org/biocLite.R")
#biocLite(c("sva","limma","bladderbatch","pamr"))
library(limma)
library(sva)
library(pamr)
library(ggplot2) # plot
library(ggmap)#plots
library(gplots)#plots
library(RColorBrewer)#color pallet
library(Hmisc)#for corelation plot
library(viridis)#for corelation plot
library(corrplot)#for corelation plot
#Metadata_coded
metadata_coded=read.csv("./InputMsAge/merge_GSE61915_GSE73503_GSE83931_metadata_SampleID_coded.csv", header=T, sep=',')
#Expression data
Expr_MsGene=read.csv("merge_GSE61915_GSE73503_GSE83931_Expr_GeneMs_SampleID.csv", header =T, sep=',')
#Rename the gene symbol column from X to GeneMs
colnames(Expr_MsGene)[colnames(Expr_MsGene) == 'X'] <- 'GeneMs'
dim(metadata_coded)
## [1] 71 6
dim(Expr_MsGene)
## [1] 16769 72
#Need to make sure that the Gene IDs column labeled as GeneMs are unique to specify rownames with Gene IDs
Expr_MsGene = Expr_MsGene[!duplicated(Expr_MsGene$GeneMs),]
dim(metadata_coded)
## [1] 71 6
dim(Expr_MsGene)
## [1] 16769 72
#We lost no Gene IDs as they are no duplicates in RNA-seq unline microarray
Expr_MsGene_1=Expr_MsGene[,-1] #get rid of extra column
rownames(Expr_MsGene_1)=Expr_MsGene$GeneMs
Expr_MsGene_1=Expr_MsGene_1[complete.cases(Expr_MsGene_1), ]
#Filter low counts or else svaobj step compalins about missing values and all zero rows
Expr_MsGene_2=Expr_MsGene_1[rowSums(Expr_MsGene_1)>0,]
edata=Expr_MsGene_2
metadata_coded_1=metadata_coded[,3:5]
rownames(metadata_coded_1)=metadata_coded$Sample_name
pheno=metadata_coded_1
DT::datatable(pheno)
DT::datatable(edata[1:7,1:7])
dim(metadata_coded)
## [1] 71 6
dim(pheno)
## [1] 71 3
dim(Expr_MsGene)
## [1] 16769 72
dim(edata)
## [1] 16769 71
#Make sure we did not lose any samples in expression data
#Here in the output the 1st two and the last two should be same i.e. the first and last samples are same
colnames(Expr_MsGene[2])
## [1] "MF_3_SRR1593496"
colnames(edata[1])
## [1] "MF_3_SRR1593496"
colnames(Expr_MsGene[72])
## [1] "M_4_SRR3734825"
colnames(edata[71])
## [1] "M_4_SRR3734825"
#full model matrix, variable of interest and other variables
#Put categorical variables ahead of numeric variables
#For this dataset there is only one Tissue type so its not included in the model
mod = model.matrix(~Study+Gender+Age, data=pheno)
#null model matrix all except the variable of interest
#To include only an intercept use mode 'mod0 = model.matrix(~1,data=pheno)'
#Put categorical variables ahead of numeric variables
mod0 = model.matrix(~Study+Gender,data=pheno)
#Estimation of Surrogate variable
n.sv = num.sv(as.matrix(edata),mod,method="be")
svobj = sva(as.matrix(edata),mod,mod0,n.sv=n.sv, B=20)
## Number of significant surrogate variables is: 8
## Iteration (out of 20 ):1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
sva function returns a list of 4 components: 1) sv= matrix with columns corrosponding to estimate of surrogate variables 2) pprob.gam=probability that each gene is associated with one or more latent variables 3) pprob.b=probability that each gene is associated with our variable of interest 4) n.sv=number of surrogate variables
#Note here the column which we care about is not used pheno[c(-?)], Age in 3rd column
pheno_sva=cbind(pheno[c(-3)],svobj$sv)
reglm_sva<-lapply(1:nrow(edata), function(x){lm(unlist(edata[x,])~.,data=pheno_sva)})
DT::datatable(pheno_sva)
residuals<-lapply(reglm_sva, function(x)residuals(summary(x)))
residuals<-do.call(rbind, residuals)
edata_adjresiduals<-residuals+matrix(apply(edata, 1, mean), nrow=nrow(residuals), ncol=ncol(residuals))
rownames(edata_adjresiduals)=rownames(edata)
rownames(residuals)=rownames(edata)
DT::datatable(edata_adjresiduals[1:7,1:7])
dim(edata_adjresiduals)
## [1] 16769 71
write.csv(edata_adjresiduals, "edata_adjresiduals.csv")
#Save .RData and clear environment to free up memory
save.image(file="temp.RData")
rm(list=ls())
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 3857352 206.1 9912070 529.4 NA 9912070 529.4
## Vcells 6766664 51.7 52283984 398.9 102400 60608265 462.5
#This 'reglm_sva' is a big object so we remove it as we have saved the other variables we need
load(file="temp.RData")
rm(reglm_sva)
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 3874792 207.0 13091016 699.2 NA 9912070 529.4
## Vcells 14075309 107.4 51861685 395.7 102400 60608265 462.5
edata #After SVA and LM adjustment edata_adjresiduals
#Summary Statistics edata
DT::datatable(summary(edata[1:7,1:7]))
write.csv(summary(edata),"summary_stats_edata.csv")
#Summary Statistics edata_adjresiduals
DT::datatable(summary(edata_adjresiduals[1:7,1:7]))
write.csv(summary(edata_adjresiduals),"summary_stats_edata_adjresiduals.csv")
pdf(file="Visualization_batch_before_and_after_SVA-LM.pdf",height=10,width=10)
par(mfrow=c(2,2))
#Before Correlation
# Colorbar along the heatmap represents Study or batch
nconditions <- nlevels(as.factor(pheno$Study))
npal <- colorRampPalette(brewer.pal(nconditions, "Set1"))(nconditions)
condition_colors <- npal[as.integer(pheno$Study)]
heatmap.2(cor(edata), RowSideColors=condition_colors,
trace='none', cexRow=0.5, main='Sample correlations before SVA LM Study')
#After Correlation
# Colorbar along the heatmap represents Study
nconditions <- nlevels(as.factor(pheno$Study))
npal <- colorRampPalette(brewer.pal(nconditions, "Set1"))(nconditions)
condition_colors <- npal[as.integer(pheno$Study)]
heatmap.2(cor(edata_adjresiduals), RowSideColors=condition_colors,
trace='none', cexRow=0.5, main='Sample correlations after SVA LM Study')
#Before boxplot
par(mai=c(1,0.8,1,0.8))
boxplot(edata, outline=FALSE, las=2, cex=0.25, main="before SVA LM Study", col="yellow")
#After boxplot
par(mai=c(1,0.8,1,0.8))
boxplot(edata_adjresiduals, outline=FALSE, las=2, cex=0.25, main="after SVA LM Study", col="yellow")
#Before MDSplot, colored by Study
par(mai=c(1,0.8,1,0.8))
plotMDS(edata, col=condition_colors, legend= "all", main="before SVA LM Study", cex=0.5)#like PCA plot
#After MDSplot, colored by Study
par(mai=c(1,0.8,1,0.8))
plotMDS(edata_adjresiduals, col=condition_colors, legend= "all", main="after SVA LM Study", cex=0.5)#like PCA plot
#Before PCAplot, colored by Study or batch
pca0=prcomp(t(edata), center=T, scale=T)
names(pca0)
## [1] "sdev" "rotation" "center" "scale" "x"
#summary(pca0)
pca0$x[1:3,1:3]
## PC1 PC2 PC3
## MF_3_SRR1593496 -62.87600 120.25801 -28.043048
## MF_3_SRR1593497 -72.81853 90.57949 -17.385049
## MF_3_SRR1593498 -74.72239 113.71426 4.161144
plot(x=pca0$x[,1], y=pca0$x[,2],
cex=0,
xlab="PC1", ylab="PC2",
main="PC plot colored by Study or batch before SVA LM",
font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
labels=pheno$Age,
col=pheno$Study,
cex=0.7, font=2)
legend("bottomright",
legend=c("GSE61915", "GSE73517", "GSE83931"),
text.col=c("red", "blue", "green")
)
#After PCAplot, colored by Study or batch
pca0=prcomp(t(edata_adjresiduals), center=T, scale=T)
names(pca0)
## [1] "sdev" "rotation" "center" "scale" "x"
#summary(pca0)
pca0$x[1:3,1:3]
## PC1 PC2 PC3
## MF_3_SRR1593496 -11.484726 3.801784 -17.7970167
## MF_3_SRR1593497 -9.777392 19.249424 -13.5698688
## MF_3_SRR1593498 1.930436 55.675007 -0.9632216
plot(x=pca0$x[,1], y=pca0$x[,2],
cex=0,
xlab="PC1", ylab="PC2",
main="PC plot colored by Study or batch after SVA LM",
font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
labels=pheno$Age,
col=pheno$Study,
cex=0.7, font=2)
legend("bottomright",
legend=c("GSE61915", "GSE73517", "GSE83931"),
text.col=c("red", "blue", "green")
)
dev.off()
## quartz_off_screen
## 2
#Export edata_adjustedresidual SVA and LM normalized data for WGCNA
write.csv(edata_adjresiduals, "MsAgeInterest_edata_adjresiduals_GeneID.csv")
#save image
save.image(file="SVALM_temp.RData")
WGCNA top variable 7301 genes same number (not same genes necesarrly) as human genes
#Load required libraries
library(Rcpp)
library(stringi)
library(biomaRt)
#The useMart() function can now be used to connect to a specified BioMart database, this must be a valid name given by listMarts(). In the next example we choose to query the Ensembl BioMart database.
listMarts()
## biomart version
## 1 ENSEMBL_MART_ENSEMBL Ensembl Genes 94
## 2 ENSEMBL_MART_MOUSE Mouse strains 94
## 3 ENSEMBL_MART_SNP Ensembl Variation 94
## 4 ENSEMBL_MART_FUNCGEN Ensembl Regulation 94
Expr_MsGene=read.csv("MsAgeInterest_edata_adjresiduals_GeneID.csv", header =T, sep=',')
ensembl=useMart("ensembl")
listDatasets(ensembl)
## dataset
## 1 acalliptera_gene_ensembl
## 2 acarolinensis_gene_ensembl
## 3 acitrinellus_gene_ensembl
## 4 amelanoleuca_gene_ensembl
## 5 amexicanus_gene_ensembl
## 6 anancymaae_gene_ensembl
## 7 aocellaris_gene_ensembl
## 8 apercula_gene_ensembl
## 9 aplatyrhynchos_gene_ensembl
## 10 apolyacanthus_gene_ensembl
## 11 atestudineus_gene_ensembl
## 12 btaurus_gene_ensembl
## 13 caperea_gene_ensembl
## 14 catys_gene_ensembl
## 15 ccapucinus_gene_ensembl
## 16 cchok1gshd_gene_ensembl
## 17 ccrigri_gene_ensembl
## 18 celegans_gene_ensembl
## 19 cfamiliaris_gene_ensembl
## 20 chircus_gene_ensembl
## 21 choffmanni_gene_ensembl
## 22 cintestinalis_gene_ensembl
## 23 cjacchus_gene_ensembl
## 24 clanigera_gene_ensembl
## 25 cpalliatus_gene_ensembl
## 26 cporcellus_gene_ensembl
## 27 csabaeus_gene_ensembl
## 28 csavignyi_gene_ensembl
## 29 csemilaevis_gene_ensembl
## 30 csyrichta_gene_ensembl
## 31 cvariegatus_gene_ensembl
## 32 dmelanogaster_gene_ensembl
## 33 dnovemcinctus_gene_ensembl
## 34 dordii_gene_ensembl
## 35 drerio_gene_ensembl
## 36 eburgeri_gene_ensembl
## 37 ecaballus_gene_ensembl
## 38 eeuropaeus_gene_ensembl
## 39 elucius_gene_ensembl
## 40 etelfairi_gene_ensembl
## 41 falbicollis_gene_ensembl
## 42 fcatus_gene_ensembl
## 43 fdamarensis_gene_ensembl
## 44 fheteroclitus_gene_ensembl
## 45 gaculeatus_gene_ensembl
## 46 gaffinis_gene_ensembl
## 47 ggallus_gene_ensembl
## 48 ggorilla_gene_ensembl
## 49 gmorhua_gene_ensembl
## 50 hburtoni_gene_ensembl
## 51 hcomes_gene_ensembl
## 52 hfemale_gene_ensembl
## 53 hmale_gene_ensembl
## 54 hsapiens_gene_ensembl
## 55 ipunctatus_gene_ensembl
## 56 itridecemlineatus_gene_ensembl
## 57 jjaculus_gene_ensembl
## 58 kmarmoratus_gene_ensembl
## 59 lafricana_gene_ensembl
## 60 lbergylta_gene_ensembl
## 61 lchalumnae_gene_ensembl
## 62 loculatus_gene_ensembl
## 63 malbus_gene_ensembl
## 64 marmatus_gene_ensembl
## 65 mauratus_gene_ensembl
## 66 mcaroli_gene_ensembl
## 67 mdomestica_gene_ensembl
## 68 mfascicularis_gene_ensembl
## 69 mfuro_gene_ensembl
## 70 mgallopavo_gene_ensembl
## 71 mleucophaeus_gene_ensembl
## 72 mlucifugus_gene_ensembl
## 73 mmola_gene_ensembl
## 74 mmulatta_gene_ensembl
## 75 mmurinus_gene_ensembl
## 76 mmusculus_gene_ensembl
## 77 mnemestrina_gene_ensembl
## 78 mochrogaster_gene_ensembl
## 79 mpahari_gene_ensembl
## 80 mspretus_gene_ensembl
## 81 mzebra_gene_ensembl
## 82 nbrichardi_gene_ensembl
## 83 neugenii_gene_ensembl
## 84 ngalili_gene_ensembl
## 85 nleucogenys_gene_ensembl
## 86 oanatinus_gene_ensembl
## 87 oaries_gene_ensembl
## 88 ocuniculus_gene_ensembl
## 89 odegus_gene_ensembl
## 90 ogarnettii_gene_ensembl
## 91 ohni_gene_ensembl
## 92 ohsok_gene_ensembl
## 93 olatipes_gene_ensembl
## 94 omelastigma_gene_ensembl
## 95 oniloticus_gene_ensembl
## 96 oprinceps_gene_ensembl
## 97 pabelii_gene_ensembl
## 98 paltaica_gene_ensembl
## 99 panubis_gene_ensembl
## 100 pbairdii_gene_ensembl
## 101 pcapensis_gene_ensembl
## 102 pcoquereli_gene_ensembl
## 103 pformosa_gene_ensembl
## 104 pkingsleyae_gene_ensembl
## 105 platipinna_gene_ensembl
## 106 pmagnuspinnatus_gene_ensembl
## 107 pmarinus_gene_ensembl
## 108 pmexicana_gene_ensembl
## 109 pnattereri_gene_ensembl
## 110 pnyererei_gene_ensembl
## 111 ppaniscus_gene_ensembl
## 112 ppardus_gene_ensembl
## 113 preticulata_gene_ensembl
## 114 psinensis_gene_ensembl
## 115 ptroglodytes_gene_ensembl
## 116 pvampyrus_gene_ensembl
## 117 rbieti_gene_ensembl
## 118 rnorvegicus_gene_ensembl
## 119 rroxellana_gene_ensembl
## 120 saraneus_gene_ensembl
## 121 sboliviensis_gene_ensembl
## 122 scerevisiae_gene_ensembl
## 123 sdorsalis_gene_ensembl
## 124 sdumerili_gene_ensembl
## 125 sformosus_gene_ensembl
## 126 sharrisii_gene_ensembl
## 127 smaximus_gene_ensembl
## 128 spartitus_gene_ensembl
## 129 sscrofa_gene_ensembl
## 130 tbelangeri_gene_ensembl
## 131 tguttata_gene_ensembl
## 132 tnigroviridis_gene_ensembl
## 133 trubripes_gene_ensembl
## 134 ttruncatus_gene_ensembl
## 135 vpacos_gene_ensembl
## 136 xcouchianus_gene_ensembl
## 137 xmaculatus_gene_ensembl
## 138 xtropicalis_gene_ensembl
## description
## 1 Eastern happy genes (fAstCal1.2)
## 2 Anole lizard genes (AnoCar2.0)
## 3 Midas cichlid genes (Midas_v5)
## 4 Panda genes (ailMel1)
## 5 Cave fish genes (Astyanax_mexicanus-2.0)
## 6 Ma's night monkey genes (Anan_2.0)
## 7 Clown anemonefish genes (AmpOce1.0)
## 8 Orange clownfish genes (Nemo_v1)
## 9 Duck genes (BGI_duck_1.0)
## 10 Spiny chromis genes (ASM210954v1)
## 11 Climbing perch genes (fAnaTes1.1)
## 12 Cow genes (UMD3.1)
## 13 Brazilian guinea pig genes (CavAp1.0)
## 14 Sooty mangabey genes (Caty_1.0)
## 15 Capuchin genes (Cebus_imitator-1.0)
## 16 Chinese hamster CHOK1GS genes (CHOK1GS_HDv1)
## 17 Chinese hamster CriGri genes (CriGri_1.0)
## 18 Caenorhabditis elegans genes (WBcel235)
## 19 Dog genes (CanFam3.1)
## 20 Goat genes (ARS1)
## 21 Sloth genes (choHof1)
## 22 C.intestinalis genes (KH)
## 23 Marmoset genes (ASM275486v1)
## 24 Long-tailed chinchilla genes (ChiLan1.0)
## 25 Angola colobus genes (Cang.pa_1.0)
## 26 Guinea Pig genes (Cavpor3.0)
## 27 Vervet-AGM genes (ChlSab1.1)
## 28 C.savignyi genes (CSAV 2.0)
## 29 Tongue sole genes (Cse_v1.0)
## 30 Tarsier genes (Tarsius_syrichta-2.0.1)
## 31 Sheepshead minnow genes (C_variegatus-1.0)
## 32 Fruitfly genes (BDGP6)
## 33 Armadillo genes (Dasnov3.0)
## 34 Kangaroo rat genes (Dord_2.0)
## 35 Zebrafish genes (GRCz11)
## 36 Hagfish genes (Eburgeri_3.2)
## 37 Horse genes (Equ Cab 2)
## 38 Hedgehog genes (eriEur1)
## 39 Northern pike genes (Eluc_V3)
## 40 Lesser hedgehog tenrec genes (TENREC)
## 41 Flycatcher genes (FicAlb_1.4)
## 42 Cat genes (Felis_catus_9.0)
## 43 Damara mole rat genes (DMR_v1.0)
## 44 Mummichog genes (Fundulus_heteroclitus-3.0.2)
## 45 Stickleback genes (BROAD S1)
## 46 Western mosquitofish genes (ASM309773v1)
## 47 Chicken genes (Gallus_gallus-5.0)
## 48 Gorilla genes (gorGor4)
## 49 Cod genes (gadMor1)
## 50 Burton's mouthbrooder genes (AstBur1.0)
## 51 Tiger tail seahorse genes (H_comes_QL1_v1)
## 52 Naked mole-rat female genes (HetGla_female_1.0)
## 53 Naked mole-rat male genes (HetGla_1.0)
## 54 Human genes (GRCh38.p12)
## 55 Channel catfish genes (IpCoco_1.2)
## 56 Squirrel genes (SpeTri2.0)
## 57 Lesser Egyptian jerboa genes (JacJac1.0)
## 58 Mangrove rivulus genes (ASM164957v1)
## 59 Elephant genes (Loxafr3.0)
## 60 Ballan wrasse genes (BallGen_V1)
## 61 Coelacanth genes (LatCha1)
## 62 Spotted gar genes (LepOcu1)
## 63 Swamp eel genes (M_albus_1.0)
## 64 Zig-zag eel genes (fMasArm1.1)
## 65 Golden Hamster genes (MesAur1.0)
## 66 Ryukyu mouse genes (CAROLI_EIJ_v1.1)
## 67 Opossum genes (monDom5)
## 68 Crab-eating macaque genes (Macaca_fascicularis_5.0)
## 69 Ferret genes (MusPutFur1.0)
## 70 Turkey genes (Turkey_2.01)
## 71 Drill genes (Mleu.le_1.0)
## 72 Microbat genes (Myoluc2.0)
## 73 Ocean sunfish genes (ASM169857v1)
## 74 Macaque genes (Mmul_8.0.1)
## 75 Mouse Lemur genes (Mmur_3.0)
## 76 Mouse genes (GRCm38.p6)
## 77 Pig-tailed macaque genes (Mnem_1.0)
## 78 Prairie vole genes (MicOch1.0)
## 79 Shrew mouse genes (PAHARI_EIJ_v1.1)
## 80 Algerian mouse genes (SPRET_EiJ_v1)
## 81 Zebra mbuna genes (M_zebra_UMD2a)
## 82 Lyretail cichlid genes (NeoBri1.0)
## 83 Wallaby genes (Meug_1.0)
## 84 Upper Galilee mountains blind mole rat genes (S.galili_v1.0)
## 85 Gibbon genes (Nleu_3.0)
## 86 Platypus genes (OANA5)
## 87 Sheep genes (Oar_v3.1)
## 88 Rabbit genes (OryCun2.0)
## 89 Degu genes (OctDeg1.0)
## 90 Bushbaby genes (OtoGar3)
## 91 Japanese Medaka HNI genes (ASM223471v1)
## 92 Japanese Medaka HSOK genes (ASM223469v1)
## 93 Japanese Medaka HdrR genes (ASM223467v1)
## 94 Indian medaka genes (Om_v0.7.RACA)
## 95 Tilapia genes (Orenil1.0)
## 96 Pika genes (OchPri2.0-Ens)
## 97 Orangutan genes (PPYG2)
## 98 Tiger genes (PanTig1.0)
## 99 Olive baboon genes (Panu_3.0)
## 100 Northern American deer mouse genes (Pman_1.0)
## 101 Hyrax genes (proCap1)
## 102 Coquerel's sifaka genes (Pcoq_1.0)
## 103 Amazon molly genes (Poecilia_formosa-5.1.2)
## 104 Paramormyrops kingsleyae genes (PKINGS_0.1)
## 105 Sailfin molly genes (P_latipinna-1.0)
## 106 Big-finned mudskipper genes (PM.fa)
## 107 Lamprey genes (Pmarinus_7.0)
## 108 Shortfin molly genes (P_mexicana-1.0)
## 109 Red-bellied piranha genes (Pygocentrus_nattereri-1.0.2)
## 110 Pundamilia nyererei genes (PunNye1.0)
## 111 Bonobo genes (panpan1.1)
## 112 Leopard genes (PanPar1.0)
## 113 Guppy genes (Guppy_female_1.0_MT)
## 114 Chinese softshell turtle genes (PelSin_1.0)
## 115 Chimpanzee genes (Pan_tro_3.0)
## 116 Megabat genes (pteVam1)
## 117 Black snub-nosed monkey genes (ASM169854v1)
## 118 Rat genes (Rnor_6.0)
## 119 Golden snub-nosed monkey genes (Rrox_v1)
## 120 Shrew genes (sorAra1)
## 121 Bolivian squirrel monkey genes (SaiBol1.0)
## 122 Saccharomyces cerevisiae genes (R64-1-1)
## 123 Yellowtail amberjack genes (Sedor1)
## 124 Greater amberjack genes (Sdu_1.0)
## 125 Asian bonytongue genes (ASM162426v1)
## 126 Tasmanian devil genes (Devil_ref v7.0)
## 127 Turbot genes (ASM318616v1)
## 128 Bicolor damselfish genes (Stegastes_partitus-1.0.2)
## 129 Pig genes (Sscrofa11.1)
## 130 Tree Shrew genes (tupBel1)
## 131 Zebra Finch genes (taeGut3.2.4)
## 132 Tetraodon genes (TETRAODON 8.0)
## 133 Fugu genes (FUGU5)
## 134 Dolphin genes (turTru1)
## 135 Alpaca genes (vicPac1)
## 136 Monterrey platyfish genes (Xiphophorus_couchianus-4.0.1)
## 137 Platyfish genes (X_maculatus-5.0-male)
## 138 Xenopus genes (JGI 4.2)
## version
## 1 fAstCal1.2
## 2 AnoCar2.0
## 3 Midas_v5
## 4 ailMel1
## 5 Astyanax_mexicanus-2.0
## 6 Anan_2.0
## 7 AmpOce1.0
## 8 Nemo_v1
## 9 BGI_duck_1.0
## 10 ASM210954v1
## 11 fAnaTes1.1
## 12 UMD3.1
## 13 CavAp1.0
## 14 Caty_1.0
## 15 Cebus_imitator-1.0
## 16 CHOK1GS_HDv1
## 17 CriGri_1.0
## 18 WBcel235
## 19 CanFam3.1
## 20 ARS1
## 21 choHof1
## 22 KH
## 23 ASM275486v1
## 24 ChiLan1.0
## 25 Cang.pa_1.0
## 26 Cavpor3.0
## 27 ChlSab1.1
## 28 CSAV 2.0
## 29 Cse_v1.0
## 30 Tarsius_syrichta-2.0.1
## 31 C_variegatus-1.0
## 32 BDGP6
## 33 Dasnov3.0
## 34 Dord_2.0
## 35 GRCz11
## 36 Eburgeri_3.2
## 37 Equ Cab 2
## 38 eriEur1
## 39 Eluc_V3
## 40 TENREC
## 41 FicAlb_1.4
## 42 Felis_catus_9.0
## 43 DMR_v1.0
## 44 Fundulus_heteroclitus-3.0.2
## 45 BROAD S1
## 46 ASM309773v1
## 47 Gallus_gallus-5.0
## 48 gorGor4
## 49 gadMor1
## 50 AstBur1.0
## 51 H_comes_QL1_v1
## 52 HetGla_female_1.0
## 53 HetGla_1.0
## 54 GRCh38.p12
## 55 IpCoco_1.2
## 56 SpeTri2.0
## 57 JacJac1.0
## 58 ASM164957v1
## 59 Loxafr3.0
## 60 BallGen_V1
## 61 LatCha1
## 62 LepOcu1
## 63 M_albus_1.0
## 64 fMasArm1.1
## 65 MesAur1.0
## 66 CAROLI_EIJ_v1.1
## 67 monDom5
## 68 Macaca_fascicularis_5.0
## 69 MusPutFur1.0
## 70 Turkey_2.01
## 71 Mleu.le_1.0
## 72 Myoluc2.0
## 73 ASM169857v1
## 74 Mmul_8.0.1
## 75 Mmur_3.0
## 76 GRCm38.p6
## 77 Mnem_1.0
## 78 MicOch1.0
## 79 PAHARI_EIJ_v1.1
## 80 SPRET_EiJ_v1
## 81 M_zebra_UMD2a
## 82 NeoBri1.0
## 83 Meug_1.0
## 84 S.galili_v1.0
## 85 Nleu_3.0
## 86 OANA5
## 87 Oar_v3.1
## 88 OryCun2.0
## 89 OctDeg1.0
## 90 OtoGar3
## 91 ASM223471v1
## 92 ASM223469v1
## 93 ASM223467v1
## 94 Om_v0.7.RACA
## 95 Orenil1.0
## 96 OchPri2.0-Ens
## 97 PPYG2
## 98 PanTig1.0
## 99 Panu_3.0
## 100 Pman_1.0
## 101 proCap1
## 102 Pcoq_1.0
## 103 Poecilia_formosa-5.1.2
## 104 PKINGS_0.1
## 105 P_latipinna-1.0
## 106 PM.fa
## 107 Pmarinus_7.0
## 108 P_mexicana-1.0
## 109 Pygocentrus_nattereri-1.0.2
## 110 PunNye1.0
## 111 panpan1.1
## 112 PanPar1.0
## 113 Guppy_female_1.0_MT
## 114 PelSin_1.0
## 115 Pan_tro_3.0
## 116 pteVam1
## 117 ASM169854v1
## 118 Rnor_6.0
## 119 Rrox_v1
## 120 sorAra1
## 121 SaiBol1.0
## 122 R64-1-1
## 123 Sedor1
## 124 Sdu_1.0
## 125 ASM162426v1
## 126 Devil_ref v7.0
## 127 ASM318616v1
## 128 Stegastes_partitus-1.0.2
## 129 Sscrofa11.1
## 130 tupBel1
## 131 taeGut3.2.4
## 132 TETRAODON 8.0
## 133 FUGU5
## 134 turTru1
## 135 vicPac1
## 136 Xiphophorus_couchianus-4.0.1
## 137 X_maculatus-5.0-male
## 138 JGI 4.2
mouse = useMart('ENSEMBL_MART_ENSEMBL', dataset = 'mmusculus_gene_ensembl')
human = useMart('ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl')
myMap = getLDS(attributes = "mgi_symbol", filters = 'mgi_symbol',
values = Expr_MsGene$X, mart = mouse,
attributesL = c("hgnc_symbol"), martL = human)
#Some genes will be duplicates (HLA), so let's make them unique.
myMap=myMap[!duplicated(myMap$HGNC.symbol), ]
#unique(myMap$HGNC.symbol)
myMap_new<-myMap
names(myMap_new) <- c("X", "HuGene")
#Now, lets merge the human gene symbols with the mouse gene symbol in 'Expr_MsGene' to get a single file with human and mouse gene symbols, and the other data in our input file
Expr_MsGeneHuGene<-merge(Expr_MsGene,myMap_new,by='X')
colnames(Expr_MsGeneHuGene)[colnames(Expr_MsGeneHuGene)=="X"] <- "MsGene"
DT::datatable(Expr_MsGeneHuGene[1:7,1:7])
#Next we want to retain only the human Genes
#Expr_MsGeneHuGene=Expr_MsGeneHuGene_1[!duplicated(Expr_MsGeneHuGene$HuGene), ]
Expr_HuGene=Expr_MsGeneHuGene[,-c(1,73)]
rownames(Expr_HuGene)=Expr_MsGeneHuGene$HuGene
Expr_HuGene=Expr_HuGene[complete.cases(Expr_HuGene), ]
DT::datatable(Expr_HuGene)